Exploring the Ensemble of Random Forests¶

In [1]:
# Imports and plotting configuration for the whole notebook.
import numpy as np
import pandas as pd
import os 
import sys
# NOTE(review): hardcoded absolute local path — this breaks on any other
# machine; consider a relative path or an environment variable.
sys.path.append('/Users/krish/Desktop/DYNAMIC MODEL VEGETATION PROJECT/au_dyanamic_vegetation_project/STEP9_DATA_MODELLING_AND_EXPLORATION')
import matplotlib.pyplot as plt
from sklearn.metrics import root_mean_squared_error, mean_squared_error

import seaborn as sns
sns.set()  # seaborn defaults; style/context refined below
sns.set_style("whitegrid") 
sns.set_context("paper")  # compact sizing suited to print/thesis figures
In [2]:
# Output/input directories (absolute Windows paths — TODO: make configurable).
# Bug fix: the first `results_dir` assignment was dead code, immediately
# overwritten by the next line; removed.  Kept for provenance:
#   'E:/Krish_New/Dynamic_Vegetation_Project_Storage/Random_Forest_Results_On_Super_Group_Results'
results_dir = 'C:/Users/krish/Desktop/DYNAMIC MODEL VEGETATION PROJECT/au_dyanamic_vegetation_project/RESULTS/Random_Forest_Results_On_Super_Group_Results_new'
directory = 'C:/Users/krish/Desktop/DYNAMIC MODEL VEGETATION PROJECT/au_dyanamic_vegetation_project/DATASETS/MODELLED_TRAINING_DATA'
plots_dir = 'C:/Users/krish/Desktop/DYNAMIC MODEL VEGETATION PROJECT/Thesis/Plots_For_Thesis/Chapter 3'
In [3]:
# Feature-name groups referenced when aggregating and plotting importances.
SEASONAL_FEATURES = ['photoperiod', 'photoperiod_gradient']

# Numeric suffixes presumably denote trailing window lengths in days — TODO confirm.
PRECIP_FEATURES = ['precip_30', 'precip_90', 'precip_180', 
                   'precip_365', 'precip_730', 'precip_1095', 
                   'precip_1460']

# Long-term climate normals (MAT/MAP: mean annual temperature/precipitation).
MEAN_ANNUAL_CLIMATE_FEATURES = ['MAT', 'MAP']

# Max/min temperature: '_lag' presumably the previous day, numbers trailing windows.
TEMP_FEATURES = ['tmax_lag', 'tmax_7', 'tmax_14', 
                 'tmax_30', 'tmin_lag', 'tmin_7', 
                 'tmin_14', 'tmin_30']

# Vapour-pressure deficit, same suffix convention as above.
VPD_FEATURES = ['VPD_lag','VPD_7', 'VPD_14',
                'VPD_30']

FIRE_FEATURES = ['days_since_fire', 'fire_severity']

CO2_FEATURES = ['CO2']

SOIL_FEATURES = ['SLGA_1','SLGA_2','SLGA_3', 'DER_000_999'] # the soil attributes to include

TOPOGRAPHIC_FEATURES = ['aspect_1s', 'twi_1s']

# NOTE(review): FEATURES omits MEAN_ANNUAL_CLIMATE_FEATURES even though
# SPATIAL_FEATURES below includes them — confirm this is intentional, since
# MAT/MAP are never zero-padded in the importance aggregation loop below.
FEATURES =  SEASONAL_FEATURES + PRECIP_FEATURES + VPD_FEATURES + FIRE_FEATURES + CO2_FEATURES + TEMP_FEATURES + SOIL_FEATURES + TOPOGRAPHIC_FEATURES

# Time-varying drivers vs static site attributes — used to split the
# aggregated importance plot into temporal and spatial panels.
TEMPORAL_FEATURES = SEASONAL_FEATURES + PRECIP_FEATURES + VPD_FEATURES + FIRE_FEATURES + CO2_FEATURES + TEMP_FEATURES
SPATIAL_FEATURES = SOIL_FEATURES + TOPOGRAPHIC_FEATURES + MEAN_ANNUAL_CLIMATE_FEATURES
In [4]:
def plotPredictions(df, TARGET, n_folds = 10, directory_plot_output = '', msg = '', split = '', fire_split = ''):
    """Plot every fold's prediction trace against the observations per target.

    Parameters
    ----------
    df : time-indexed DataFrame holding observed '<target>' columns and
        '<target>_prediction_<k>' columns for k = 1..n_folds.
    TARGET : list of observed column names; one subplot per entry.
    n_folds : number of per-fold prediction columns to draw.
    directory_plot_output : if non-empty, path the figure is saved to.
    msg : figure title.
    split / fire_split : optional iterables of x positions marked with
        dashed black / red vertical lines (empty-string default is falsy).
    """
    fig, ax = plt.subplots(nrows=len(TARGET), figsize=(15, 10))
    fig.suptitle(msg, fontsize=30)

    for i, v in enumerate(TARGET):
        short = v.split("_")[0]  # e.g. 'pv_filter' -> 'pv'

        for folder_num in range(n_folds):
            # Bug fix: label only the first fold; the original labelled every
            # fold, flooding the legend with n_folds identical entries.
            # (labels starting with '_' are excluded from the legend)
            df[f'{v}_prediction_{folder_num + 1}'].plot(
                ax=ax[i], ylim=(0, 100),
                label=f'Modelled {short}' if folder_num == 0 else '_nolegend_')

        df[v].plot(ax=ax[i], color='black', label=f'Observed {short}')
        ax[i].legend()
        ax[i].grid(True)
        if split:
            for s in split:
                ax[i].axvline(s, color='black', ls='--')
        if fire_split:
            for f in fire_split:
                ax[i].axvline(f, color='red', ls='--')

    # Bug fix: directory_plot_output was accepted but never used (compare
    # plotMedianPredictions, which saves the figure).
    if directory_plot_output:
        plt.savefig(fname=directory_plot_output)

def plotMedianPredictions(df, TARGET, n_folds = 10, directory_plot_output = '', msg = '', split = '', fire_split = ''):
    """Plot the ensemble median prediction with its 2.5-97.5% band per target.

    Parameters
    ----------
    df : time-indexed DataFrame holding, for each target 'x_filter', the
        observed 'x_filter' plus precomputed 'x_median', 'x_lower', 'x_upper'.
    TARGET : list of observed column names; one subplot per entry.
    n_folds : unused; kept for signature parity with plotPredictions.
    directory_plot_output : if non-empty, path the figure is saved to.
    msg : figure title.
    split / fire_split : optional iterables of x positions marked with
        dashed black / red vertical lines (empty-string default is falsy).
    """
    # One row of axes per target (was hardcoded to 3), sharing the time axis.
    fig, ax = plt.subplots(nrows=len(TARGET), figsize=(15, 10), sharex=True)
    fig.suptitle(msg, fontsize=30)

    for i, v in enumerate(TARGET):
        short = v.split("_")[0]  # e.g. 'pv_filter' -> 'pv'
        df[f'{short}_median'].plot(ax=ax[i], ylim=(0, 100), label=f'Modelled {short} median')
        ax[i].fill_between(df.index, df[f'{short}_lower'].values, df[f'{short}_upper'].values,
                           color='blue', alpha=0.3, label='2.5%-97.5% Quantiles')
        df[v].plot(ax=ax[i], color='black', label=f'Observed {short}')

        ax[i].legend()
        ax[i].grid(True)
        if split:
            for s in split:
                ax[i].axvline(s, color='black', ls='--')
        if fire_split:
            for f in fire_split:
                ax[i].axvline(f, color='red', ls='--')

    if directory_plot_output:
        plt.savefig(fname=directory_plot_output)
In [5]:
# Vegetation super-groups with a trained RF ensemble; indexed below to pick
# the group analysed by the rest of the notebook.
super_group_list = ['Desert Chenopod', 'Desert Forb', 'Desert Hummock.grass',
       'Desert Shrub', 'Desert Tree.Palm', 'Desert Tussock.grass',
       'Temp/Med Shrub', 'Temp/Med Tree.Palm', 'Temp/Med Tussock.grass',
       'Tropical/Savanna Tree.Palm', 'Tropical/Savanna Tussock.grass']

Get the Variable Importance¶

In [6]:
# Pick the super-group to analyse and derive a filesystem-safe folder name
# ('/' is not a legal path character, so it becomes '_').
chosen_super_group = super_group_list[5]
super_group_folder_name = chosen_super_group.replace('/', '_')
print(f'Looking at {super_group_folder_name}')
Looking at Desert Tussock.grass
In [7]:
# Paths into this super-group's results: per-fold variable-importance CSVs
# and the combined held-out test predictions.
var_path = f'{results_dir}/{super_group_folder_name}/Results/Variable_Importances'
test_pred_path = f'{results_dir}/{super_group_folder_name}/Results/Test_Predictions/test_predictions.csv'
In [8]:
n_models = 10  # number of K-fold models with an importance CSV (was a magic 10)
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(15, 12))

# One horizontal-bar panel of importances per fold model; the 4x3 grid has
# two spare panels, which are hidden.
for i, ax in enumerate(axes.flat):
    if i < n_models:
        folder_num = i + 1
        per_importance = pd.read_csv(f'{var_path}/KFold_{folder_num}_RF_VariableImportance.csv', index_col = 0)
        per_importance['importances_mean'].plot.barh(ax = ax)
        ax.set_xlabel('Mean Gain in MSE')
        ax.title.set_text(f'Model {folder_num}')
    else:
        ax.axis('off')  # unused grid cell

plt.tight_layout()
fig.savefig(f'{plots_dir}/{super_group_folder_name}_Individual_Model_Importances.png')
No description has been provided for this image

Aggregating the Variable Importances Across Folds by Their Average Score¶

In [9]:
number_of_folds = 10
# feature name -> list of per-fold mean importance scores
variable_common_rank = dict()

# Collect each fold's importance score per feature; features a fold's model
# did not report are padded with 0 so every feature ends up with exactly
# `number_of_folds` entries.
for f in range(number_of_folds):
    folder_num = f + 1
    per_importance = pd.read_csv(f'{var_path}/KFold_{folder_num}_RF_VariableImportance.csv')
    per_importance = per_importance.sort_values('importances_mean', ascending = False).reset_index(drop = True)
    # (dead code removed: a 'rank' column was assigned here but never read)

    feature_track = []  # features present in this fold's CSV
    for i, row in per_importance.iterrows():
        # Feature names live in the unnamed former-index column of the CSV.
        var = row['Unnamed: 0']
        # Bug fix (naming): this value is the importance score, not a rank —
        # the old local name `rank` was misleading.
        importance = row['importances_mean']
        variable_common_rank.setdefault(var, []).append(importance)
        feature_track.append(var)

    # Pad features absent from this fold with a 0 score.
    features_not_found = []
    for var in FEATURES:
        if var not in feature_track:
            variable_common_rank.setdefault(var, []).append(0)
            features_not_found.append(var)

    print(f'{features_not_found} was not found in Model {f+1}')
['precip_30', 'precip_90', 'precip_180', 'precip_730', 'precip_1095', 'precip_1460', 'VPD_lag', 'VPD_14', 'days_since_fire', 'fire_severity', 'tmax_lag', 'tmax_7', 'tmax_14', 'tmax_30', 'tmin_14', 'tmin_30', 'SLGA_3', 'DER_000_999', 'twi_1s'] was not found in Model 1
['photoperiod', 'precip_30', 'precip_90', 'precip_180', 'precip_365', 'precip_730', 'precip_1095', 'precip_1460', 'VPD_lag', 'VPD_7', 'VPD_14', 'days_since_fire', 'fire_severity', 'tmax_lag', 'tmax_7', 'tmax_30', 'tmin_lag', 'tmin_7', 'tmin_30', 'SLGA_3', 'DER_000_999', 'aspect_1s', 'twi_1s'] was not found in Model 2
['precip_90', 'precip_365', 'precip_730', 'VPD_7', 'VPD_14', 'VPD_30', 'days_since_fire', 'fire_severity', 'tmax_7', 'tmax_14', 'tmax_30', 'tmin_lag', 'tmin_7', 'tmin_14', 'tmin_30', 'DER_000_999', 'aspect_1s', 'twi_1s'] was not found in Model 3
['photoperiod', 'precip_30', 'precip_90', 'precip_180', 'precip_730', 'precip_1095', 'precip_1460', 'VPD_lag', 'VPD_14', 'VPD_30', 'fire_severity', 'tmax_lag', 'tmax_7', 'tmax_14', 'tmin_lag', 'tmin_7', 'tmin_14', 'twi_1s'] was not found in Model 4
['photoperiod_gradient', 'precip_180', 'precip_730', 'precip_1095', 'precip_1460', 'VPD_lag', 'VPD_7', 'VPD_14', 'days_since_fire', 'fire_severity', 'CO2', 'tmax_lag', 'tmax_7', 'tmax_14', 'tmax_30', 'tmin_7', 'tmin_14', 'tmin_30', 'SLGA_3', 'aspect_1s', 'twi_1s'] was not found in Model 5
['photoperiod', 'VPD_lag', 'VPD_7', 'VPD_14', 'days_since_fire', 'fire_severity', 'CO2', 'tmax_lag', 'tmax_7', 'tmax_14', 'tmax_30', 'tmin_lag', 'tmin_7', 'tmin_30', 'aspect_1s', 'twi_1s'] was not found in Model 6
['photoperiod_gradient', 'precip_180', 'precip_365', 'precip_1460', 'VPD_14', 'days_since_fire', 'fire_severity', 'tmax_lag', 'tmax_7', 'tmax_14', 'tmin_lag', 'tmin_7', 'tmin_14', 'SLGA_2', 'SLGA_3', 'aspect_1s', 'twi_1s'] was not found in Model 7
['precip_365', 'precip_730', 'precip_1095', 'VPD_lag', 'VPD_7', 'VPD_14', 'VPD_30', 'days_since_fire', 'fire_severity', 'tmax_lag', 'tmax_7', 'tmax_14', 'tmax_30', 'tmin_lag', 'tmin_7', 'SLGA_3', 'DER_000_999', 'aspect_1s', 'twi_1s'] was not found in Model 8
['photoperiod', 'precip_90', 'precip_180', 'precip_730', 'precip_1095', 'precip_1460', 'VPD_7', 'VPD_14', 'VPD_30', 'days_since_fire', 'fire_severity', 'tmax_lag', 'tmax_7', 'tmax_30', 'tmin_lag', 'tmin_7', 'SLGA_2', 'DER_000_999'] was not found in Model 9
['precip_30', 'precip_90', 'precip_180', 'precip_730', 'precip_1095', 'precip_1460', 'VPD_lag', 'VPD_7', 'VPD_14', 'VPD_30', 'days_since_fire', 'fire_severity', 'tmax_lag', 'tmax_7', 'tmax_14', 'tmax_30', 'tmin_lag', 'tmin_7', 'tmin_14', 'SLGA_3', 'DER_000_999', 'aspect_1s', 'twi_1s'] was not found in Model 10
In [10]:
# Average each feature's per-fold importances into a single score, then
# arrange the scores as a one-column DataFrame sorted ascending (so the
# most important features plot at the top of a barh chart).
collapsed_importances = {feature: np.mean(scores)
                         for feature, scores in variable_common_rank.items()}

collapsed_importance = (
    pd.DataFrame.from_dict(collapsed_importances, orient='index')
      .sort_values(0)
)
In [11]:
number_of_folds = 10
fig, axes = plt.subplots(ncols= 2, figsize = (10,5))

# Split the aggregated importances into spatial vs temporal panels.
# Idiom fix: label-based boolean selection uses .loc, not .iloc.
spatial = collapsed_importance.loc[collapsed_importance.index.isin(SPATIAL_FEATURES)]
spatial[0].plot.barh(ax = axes[0], legend=False)

temporal = collapsed_importance.loc[collapsed_importance.index.isin(TEMPORAL_FEATURES)]
# Consistency fix: select column 0 (a Series) as for `spatial` above, rather
# than plotting the whole one-column DataFrame.
temporal[0].plot.barh(ax = axes[1], legend=False)

for ax in axes.flat:
    ax.grid(True)
    ax.set_xlabel('Mean Gain in MSE')

fig.suptitle(f'{super_group_folder_name}', fontsize = 15)
plt.tight_layout()
fig.savefig(f'{plots_dir}/{super_group_folder_name}_Aggregated_Model_Importances.png')
No description has been provided for this image

Assessing the Predictions¶

In [12]:
number_of_folds = 10
# Held-out predictions: one row per (site, time), with per-fold columns
# '<frac>_filter_prediction_<k>' alongside the observed '<frac>_filter'.
test_set = pd.read_csv(test_pred_path, parse_dates = ['time']).set_index('time').sort_values('time')

# Summarise the ensemble of fold predictions into median / mean / 2.5% /
# 97.5% columns per cover fraction.  (The original copy-pasted this block
# three times, reusing the misleading names `predictor_names_pv` and
# `pv_summaries` for the npv and bs fractions.)
for frac in ['pv', 'npv', 'bs']:
    predictor_names = [f'{frac}_filter_prediction_{j + 1}' for j in range(number_of_folds)]
    # Transpose so describe() summarises across folds (rows) per timestamp.
    summaries = test_set[predictor_names].T.describe(percentiles = [0.025, 0.5, 0.975]).T
    test_set[f'{frac}_median'] = summaries['50%'].values
    test_set[f'{frac}_mean'] = summaries['mean'].values
    test_set[f'{frac}_lower'] = summaries['2.5%'].values
    test_set[f'{frac}_upper'] = summaries['97.5%'].values
In [13]:
# Bug fix: these are RMSEs (root_mean_squared_error), so the header now says
# RMSE rather than the original 'Mean MSEs'.  Also pass arguments in
# sklearn's (y_true, y_pred) order — RMSE is symmetric, so values are
# unchanged, but the call now matches the documented contract.
print('Mean RMSEs')
pv_mean_rmse = round(root_mean_squared_error(test_set['pv_filter'], test_set['pv_mean']), 3)
npv_mean_rmse = round(root_mean_squared_error(test_set['npv_filter'], test_set['npv_mean']), 3)
bs_mean_rmse = round(root_mean_squared_error(test_set['bs_filter'], test_set['bs_mean']), 3)

print(f'PV: {pv_mean_rmse}')
print(f'NPV: {npv_mean_rmse}')
print(f'BS: {bs_mean_rmse}')
Mean MSEs
PV: 10.251
NPV: 11.095
BS: 17.298
In [14]:
TARGET = ['pv_filter', 'npv_filter', 'bs_filter'] # observed fractional-cover target columns (PV, NPV, bare soil)
In [15]:
sites_unique = np.unique(test_set['site_location_name'])

# One median-prediction figure per site, titled with the site's RMSEs of the
# ensemble median against the observations.
for sit in sites_unique:
    site_specific = test_set[test_set['site_location_name'] == sit]

    site_pv_rmse = round(root_mean_squared_error(site_specific['pv_median'], site_specific['pv_filter']), 2)
    site_npv_rmse = round(root_mean_squared_error(site_specific['npv_median'], site_specific['npv_filter']), 2)
    site_bs_rmse = round(root_mean_squared_error(site_specific['bs_median'], site_specific['bs_filter']), 2)

    # (dead commented-out call to plotPredictions removed; reuse the
    # already-computed site_specific slice instead of re-filtering.)
    plotMedianPredictions(site_specific, n_folds = number_of_folds,
                          TARGET = TARGET, msg = f'{sit}, [pv_rmse: {site_pv_rmse}, npv_rmse: {site_npv_rmse}, bs_rmse: {site_bs_rmse}]',
                          directory_plot_output = f'{plots_dir}/{super_group_folder_name}_{sit}_Median_Predictions.png')
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Inspect Linear Alignment¶

In [16]:
fig, axes = plt.subplots(nrows = 2, ncols= 2, figsize = (10,10))
fractions = ['pv', 'npv', 'bs']

# Observed vs modelled-median scatter with a regression line (red) and the
# 1:1 line (black) per fraction; the fourth panel of the 2x2 grid is hidden.
for i, ax in enumerate(axes.flat):

    if i < len(fractions):
        one_to_one = [i for i in range(101)]
        sns.regplot(data = test_set, x = f'{fractions[i]}_filter', y = f'{fractions[i]}_median', ax = ax, line_kws=dict(color="r"))
        ax.plot(one_to_one,one_to_one, color = 'black')
        ax.grid(True)
        ax.set_xlabel(f'{fractions[i].upper()}')
        # Bug fix: the original called set_xlabel twice, clobbering the first
        # label and leaving the y axis unlabelled; the y data are the
        # modelled medians, so that label belongs on the y axis.
        ax.set_ylabel(f'Modelled {fractions[i].upper()}')

        ax.set_xlim([0, 100])
        ax.set_ylim([0, 100])
        ax.set_aspect('equal')

    else:
        ax.axis('off') 
No description has been provided for this image